{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 读取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>5306</th>\n",
       "      <th>5307</th>\n",
       "      <th>5308</th>\n",
       "      <th>5309</th>\n",
       "      <th>5310</th>\n",
       "      <th>5311</th>\n",
       "      <th>5312</th>\n",
       "      <th>5313</th>\n",
       "      <th>5314</th>\n",
       "      <th>5315</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 5316 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9  ...   5306  5307  5308  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...    0.0   0.0   0.0   \n",
       "\n",
       "   5309  5310  5311  5312  5313  5314  5315  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 5316 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('FE_train_tfidf_1.csv')\n",
    "y_train = train['label']\n",
    "X_train = train.drop('label', axis=1)\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 模长归一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       ..., \n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
       "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "normalize(X_train, norm='l2', copy=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K, X):\n",
    "    print('K-means begin with clusters:{}'.format(K))\n",
    "    \n",
    "    mb_kmeans = MiniBatchKMeans(n_clusters = K)\n",
    "    y_pred = mb_kmeans.fit_predict(X)\n",
    "    \n",
    "    CH_score = metrics.calinski_harabaz_score(X, y_pred)\n",
    "    \n",
    "    print('CH_score:{0}'.format(CH_score))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters:5\n",
      "CH_score:34.680074740623596\n",
      "K-means begin with clusters:10\n",
      "CH_score:25.438330777254592\n",
      "K-means begin with clusters:15\n",
      "CH_score:21.409098570230682\n",
      "K-means begin with clusters:20\n",
      "CH_score:16.4638650722022\n",
      "K-means begin with clusters:30\n",
      "CH_score:13.844658700697373\n",
      "K-means begin with clusters:40\n",
      "CH_score:10.483470884594581\n",
      "K-means begin with clusters:50\n",
      "CH_score:10.038197229038303\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数(聚类数目K) 搜索范围\n",
    "\n",
    "Ks = [5, 10, 15, 20, 30, 40, 50]\n",
    "\n",
    "CH_scores = []\n",
    "\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with clusters:2\n",
      "CH_score:49.81583573398776\n",
      "K-means begin with clusters:3\n",
      "CH_score:48.09562828945313\n",
      "K-means begin with clusters:4\n",
      "CH_score:35.313285497776945\n",
      "K-means begin with clusters:5\n",
      "CH_score:39.0665683825024\n",
      "K-means begin with clusters:6\n",
      "CH_score:34.558699652193326\n",
      "K-means begin with clusters:7\n",
      "CH_score:26.370779177207055\n",
      "K-means begin with clusters:8\n",
      "CH_score:26.654982587277267\n",
      "K-means begin with clusters:9\n",
      "CH_score:26.503505824702724\n"
     ]
    }
   ],
   "source": [
    "# 设置超参数(聚类数目K) 搜索范围\n",
    "\n",
    "Ks = [2, 3, 4, 5, 6, 7, 8, 9]\n",
    "\n",
    "CH_scores = []\n",
    "\n",
    "for K in Ks:\n",
    "    ch = K_cluster_analysis(K, X_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best_K:2\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAHJVJREFUeJzt3XmYVNWdxvHvD7pZBBTDMiIdRRzF\nbRS1RR1wYXOJ+4bGDVGHwiQO6jMRt7gb4iSKcYwSBIREDOIWl0gGR3FLXNIoIEqMGyiKARdEQJHl\nN3+c4hGV7i6gqs69t97P89TTC9V9X/qBt26fe+455u6IiEj6NYkdQEREikOFLiKSESp0EZGMUKGL\niGSECl1EJCNU6CIiGaFCFxHJCBW6iEhGqNBFRDKiqpwHa9++vXfp0qWchxQRSb1p06Z95O4dGnte\nWQu9S5cu1NXVlfOQIiKpZ2ZzC3mehlxERDJChS4ikhEqdBGRjChoDN3M5gCfA6uAle5ea2bfA+4G\nugBzgAHu/mlpYoqISGPW5wy9t7t3d/fa/McXAY+7+3bA4/mPRUQkko0ZcjkKGJ9/fzxw9MbHERGR\nDVVooTswxcymmdng/Of+xd3nA+TfdixFQBERKUyh89B7uvsHZtYReMzM/l7oAfIvAIMBttpqqw2I\nCHfeCcuWwTHHQIdGp9aLiFSmgs7Q3f2D/NsFwANAD+CfZtYJIP92QT1fO8rda929tsMGtvHEiZDL\nQadO0L8/jBoFCxdu0LcSEcmsRgvdzFqZWZs17wMHAbOAh4CB+acNBB4sVciHH4aXX4YLL4Q5c1Tu\nIiLrYu7e8BPMuhLOyiEM0dzl7teZWTtgErAV8C5wgrt/0tD3qq2t9Y299d8dZsyASZPgnnvgzTeh\naVPo3RtOOEHDMiKSPWY2ba0ZhvU/r7FCL6ZiFPraGir3AQNCubdvX7TDiYhEURGFvjZ3mD49FLvK\nXUSypOIKfW31lXufPl8Py6jcRSQtKrrQ17Z2uU+aBG+9pXIXkXRRoa+Dyl1E0kiF3oiGyn3AADj6\naJW7iCSDCn09rCn3NbNlvl3uxxwD7drFTikilUqFvoHqK/e+fb8ellG5i0g5qdCLwD3cobpmtozK\nXURiUKEXmcpdRGJRoZfQ2uU+aRK8/fbX5b7mgqrKXUSKRYVeJusq9xYt4NlnYc89Y6cTkSwotNC1\nSfRGMoM99oDhw8MdqXV10KwZ3HRT7GQiUmlU6EVkFs7KTz01nLF//HHsRCJSSVToJZDLwfLl8Lvf\nxU4iIpVEhV4Cu+4K++4Lv/1tGGMXESkHFXqJ5HLw+uvw1FOxk4hIpVChl8iAAdC2bThLFxEpBxV6\nibRsCQMHwn33wYJ1bp8tIlJcKvQSyuVgxQoYNy52EhGpBCr0EtpxR9hvPxg1Clavjp1GRLJOhV5i\nuVxY9+WJJ2InEZGsU6GX2HHHhXVdRo6MnUREsk6FXmItWsAZZ8CDD8KHH8ZOIyJZpkIvg8GDYeVK\nGDs2dhIRyTIVehlsv33Yzu7222HVqthpRCSrVOhlksvBnDkwZUrsJCKSVSr0Mjn6aOjYUXeOikjp\nqNDLpFkzGDQIHnkE5s2LnUZEskiFXkaDB4cx9DFjYicRkSxSoZdR165w0EEwenSY9SIiUkwq9DLL\n5cKQy+TJsZOISNao0MvsiCOgUyddHBWR4lOhl1l1NZx1Fjz6KMydGzuNiGRJwYVuZk3N7GUzeyT/\n8Tgze8fMpucf3UsXM1vOPju8HT06bg4RyZb1OUMfCsz+1ud+6u7d84/pRcyVaVtvDYceGma7rFgR\nO42IZEVBhW5mNcBhgM4pi2TIEJg/Hx5+OHYSEcmKQs/QbwIuBL69TcN1ZjbTzEaYWfPiRsu2Qw+F\nmhpdHBWR4mm00M3scGCBu0/71h9dDOwA7AV8DxhWz9cPNrM6M6tbuHDhxubNjKqqMJY+ZQq8/Xbs\nNCKSBYWcofcEjjSzOcBEoI+Z3enu8z1YDtwB9FjXF7v7KHevdffaDh06FC14Fpx1FjRpElZhFBHZ\nWI0Wurtf7O417t4FOAl4wt1PNbNOAGZmwNHArJImzaCamjAvfexY+Oqr2GlEJO02Zh76BDN7BXgF\naA9cW5xIlSWXgwUL4I9/jJ1ERNLO3L1sB6utrfW6urqyHS8NVq2CbbcNj8cfj51GRJLIzKa5e21j\nz9OdopE1bRpWYXziCfjHP2KnEZE0U6EnwJlnhlkvo0bFTiIiaaZCT4AttoCjjoJx4+DLL2OnEZG0\nUqEnRC4HH38M998fO4mIpJUKPSH69g0XRkeOjJ1ERNJKhZ4QTZqEi6PPPAOvvRY7jYikkQo9Qc44\nI6yXroujIrIhVOgJ0rEjHHccjB8PX3wRO42IpI0KPWFyOVi0CCZNip1ERNJGhZ4wBxwA3bppWV0R\nWX8q9IQxCxdHn3sOZs6MnUZE0kSFnkADB0Lz5jpLF5H1o0JPoHbt4IQT4M47YenS2GlEJC1U6AmV\ny8HixTBxYuwkIpIWKvSE6tkTdt5Zwy4iUjgVekKZhbP0v/0NXnopdhoRSQMVeoKddhq0bKmzdBEp\njAo9wdq2hRNPhLvugs8/j51GRJJOhZ5wQ4bAkiUwYULsJCKSdCr0hOvRA3bbLQy7lHH7VxFJIRV6\nwq25ODp9erhAKiJSHxV6CpxyCrRqpYujItIwFXoKbLopnHwy/OEPYSVGEZF1UaGnRC4X1ki/887Y\nSUQkqVToKbHnnuGhi6MiUh8VeooMGQKzZsFf/xo7iYgkkQo9RU46Cdq00cVREVk3FXqKtG4Np54a\ntqf75JPYaUQkaVToKZPLwfLl8LvfxU4iIkmjQk+Z3XaDffaBkSN1cVREvkmFnkK5HLz+Ojz9dOwk\nIpIkKvQUGjAANttMF0dF5JtU6Cm0ySZhI+n77oOFC2OnEZGkKLjQzaypmb1sZo/kP97GzF4wszfM\n7G4za1a6mPJtuRx89RWMGxc7iYgkxfqcoQ8FZq/18fXACHffDvgUOKuYwaRhO+0EvXrBqFGwenXs\nNCKSBAUVupnVAIcBo/MfG9AHuDf/lPHA0aUIKPUbMgTefBOmTo2dRESSoNAz9JuAC4E154LtgEXu\nvjL/8Tygc5GzSSOOOw7atQtTGEVEGi10MzscWODu09b+9Dqeus5Z0WY22MzqzKxuoa7gFVWLFuHi\n6B//CB9+GDuNiMRWyBl6T+BIM5sDTCQMtdwEtDWzqvxzaoAP1vXF7j7K3WvdvbZDhw5FiCxrGzwY\nVq6EO+6InUREYmu00N39YnevcfcuwEnAE+5+CjAVOD7/tIHAgyVLKfXq1g1699bFURHZuHnow4AL\nzOxNwpj6mOJEkvWVy8GcOTBlSuwkIhKTeRkXBKmtrfW6urqyHa9SfPUV1NRAz57wwAOx04hIsZnZ\nNHevbex5ulM0A5o1g0GD4OGH4f33Y6cRkVhU6BkxeDCsWgVjNPAlUrFU6Bmx7bbQvz+MHh2KPYvu\nvht69IAnnoidRCSZVOgZksvBe+/B5MmxkxTX0qVw9tlhC75XXoFDDoHf/z52KpHkUaFnyJFHwhZb\nZGtZ3enTYc89YexYuPRSmDcP9tsPTj8drrlGm3yIrE2FniHV1XDWWfDoo/Duu7HTbBx3uPlm2Htv\n+PxzePxxuPbasNTB5Mmh0C+/PJy5r1gRO61IMqjQM+Y//iOU4ejRsZNsuI8+Cr9tDB0KBx8MM2aE\nm6fWaNYsLBv8s5+FM/fDD4fFi6PFFUkMFXrGbL11GGMePTqdZ65Tp4Z9U6dMCWfoDz4I7dt/93lm\ncPXVYVbP44/D/vtryqaICj2DhgyB+fPhkUdiJyncihVw2WXQty+0aQMvvgjnnhuKuyFnngl/+hO8\n9VbYPPuVV8qTVySJVOgZ9IMfQOfO6bk4OmcOHHAAXHddKOhp08JZeqEOPhieeSasZdOrVzhjF6lE\nKvQMqqoKFwunTIF33omdpmGTJkH37vDqqzBxYhgqatVq/b9P9+7w/PNfDzmNH1/8rCJJp0LPqLPP\nDsMVt98eO8m6LV0aLuCeeCLsuGOYnnjiiRv3Pb///XCmfuCBcMYZYYxd0xqlkqjQM6qmJsz+GDMm\nLN6VJDNmQG1tyHbJJfD007DNNsX53pttFsbUBw6EK64IQzhpvDgssiFU6BmWy8GCBWGmSBK4wy23\nhLnln30Gjz0Wxs2rq4t7nGbNwoYfV1wRpjcedpimNUplUKFn2MEHw1ZbJePi6EcfwVFHhZkr/fqF\ns/S+fUt3PDO48sowT33q1HB36bx5pTueSBKo0DOsadOwCuPjj8Mbb8TL8eSTYdbK//4v3HRTWOa3\nXLsRDhoU7px9550wrXHmzPIcVyQGFXrGnXlmmPUyalT5j71yZbibs08faN06zEIZOrTxueXF1r8/\nPPtseL9XrzDUI5JFKvSM69Qp3EY/bhwsX16+486dG+aWX3ttOEueNg123718x/+2XXcNLyjbbBPm\n6WtTbckiFXoFGDIkjGHff395jnfPPWGIZdYsuOuuMJuldevyHLshNTVhWmPv3uE3lyuv1LRGyRYV\negXo2xe6doWRI0t7nGXLwpj9gAGwww7w8svwwx+W9pjra9NNw7TGQYPgqqvC26RN6xTZUCr0CtCk\nSSjap5+G2bNLc4yZM8Pc8tGj4aKLwplw166lOdbGqq4OvzVcdVW4o/Sww8I0SpG0U6FXiEGDQpEV\n++KoO/zmN2FruE8/DcsNDB9e/LnlxWYW1lMfNy7MwtG0RskCFXqF6NgRjj02nJF+8UVxvufHH8Mx\nx8BPfhKGdWbODHPM02TgwLBhxty54YanGTNiJxLZcCr0CpLLhbPoe+7Z+O/11FPhwuejj8KIEWGp\n3nLNLS+2fv3CtMYmTcKZ+pQpsROJbBgVegU58EDYfvuNu3N05cpwS32fPrDJJmEq4HnnlX9uebH9\n27+Fv0vXrmFa49ixsROJrD8VegUxC2fpf/3rhm0EMXdueFG4+uqwp+dLL8EeexQ9ZjSdO4cLx/36\nhb1ZL79c0xolXVToFWbgQGjefP3P0u+7L6w5PnMmTJgQbsxJwtzyYtt007A0wVlnwTXXhJ+XpjVK\nWqjQK0y7dnD88fD734c1yRuzbFk4qz/+eNhuuzC3/OSTS58zpurqsI78NdeEn9Ohh8KiRbFTiTRO\nhV6BcrmwnOzddzf8vFmzwnTEUaNg2LBw4XDbbcuTMTazsMfp+PFhGKZXL3jvvdipRBqmQq9AvXrB\nTjvVP+ziDrfdBnvtFZYMmDIFfvGLsM54pTn9dPjzn0OZ77NP2FlJJKlU6BVozcXRF18MQyhr++ST\nMF/9Rz8KF0BnzgyrFVayvn3hL38JyxHvt18oeJEkUqFXqNNOgxYtvnmW/vTTYW75n/4EN9wQ3nbs\nGC9jkuyyS5jW+K//Grb2Gz06diKR71KhV6jNN4eTTgozVhYtCisP9u4dSv655+CCC8KNNvK1LbcM\nL3r9+4cNri+7TNMaJVn0X7aC5XKwZAnsvHNYqOq008Lc8j33jJ0sudq0gYcegrPPDvuhnn66pjVK\ncjRa6GbWwsxeNLMZZvaqmV2V//w4M3vHzKbnH91LH1eKae+9w41BixfDnXeGharatImdKvnWLHJ2\n3XXh53bIIZrWKMlQVcBzlgN93H2JmVUDz5rZ5Pyf/dTd7y1dPCkls7DP5+rVGitfX2ZwySVhE+4z\nz4SePcMiX1ttFTuZVLJGz9A9WJL/sDr/0MhhRrRvrzLfGKeeGl4U338/TGv89qwhkXIqaAzdzJqa\n2XRgAfCYu7+Q/6PrzGymmY0ws+b1fO1gM6szs7qFCxcWKbZIcvTuHaY1VleHaY2TJzf+NSKlUFCh\nu/sqd+8O1AA9zGwX4GJgB2Av4HvAsHq+dpS717p7bYe0rq8q0oiddw7TGrffHo44ovgbiYgUYr1m\nubj7IuBJ4BB3n58fjlkO3AH0KEE+kdTo1ClMazzooDCD6NJLNa1RyquQWS4dzKxt/v2WQD/g72bW\nKf85A44GZpUyqEgatG4dpjUOHgw//zn853/GTiSVpJBZLp2A8WbWlPACMMndHzGzJ8ysA2DAdGBI\nCXOKpEZVFYwcGZYp/p//CStVHnBA7FRSCczL+DthbW2t19XVle14IjEtWxaWDGjWLCzq1aJF7ESS\nVmY2zd1rG3ue7hQVKZFNNgln6q+/DsOHx04jlUCFLlJCBx0UNgQZPhxmz46dRrJOhS5SYiNGhIul\nuVy4K1ekVFToIiXWsSP88pfwzDMwdmzsNJJlKnSRMjjzzDDT5ac/hQ8/jJ1GskqFLlIGZmEzkWXL\n4PzzY6eRrFKhi5RJt25hhcaJE7WNnZSGCl2kjC66CHbYAc45B5YujZ1GskaFLlJGzZuHoZc5c8Iu\nUSLFpEIXKbP99w9b2N14Y7iDVKRYVOgiEfz3f0O7dmGz6VWrYqeRrFChi0Sw+eZw001QVwe/+U3s\nNJIVKnSRSE46KWwwfeml8N57sdNIFqjQRSIxg1tvDUMuP/mJNsOQjadCF4lom23CbJeHHoIHHoid\nRtJOhS4S2XnnwW67wbnnwmefxU4jaaZCF4msuhpuvx3mzw/j6SIbSoUukgB77RXO0G+9FZ5/PnYa\nSSsVukhCXHstdO4c5qavWBE7jaSRCl0kIdq0gVtugVmz4IYbYqeRNFKhiyTIUUfBsceGmS9vvRU7\njaSNCl0kYW6+OVwoPecczU2X9aNCF0mYzp3DptKPPQYTJsROI2miQhdJoCFDYO+9w+5GH38cO42k\nhQpdJIGaNoVRo2DRorAPqUghVOgiCbXrrvBf/wV33AFPPhk7jaSBCl0kwS6/HLp2hVwOvvwydhpJ\nOhW6SIK1bAkjR8I//gE//3nsNJJ0KnSRhOvfH045BX7xC3jttdhpJMlU6CIpcOON4U7SXA5Wr46d\nRpJKhS6SAh07wq9+Bc8+C2PGxE4jSaVCF0mJM86AAw8M0xg//DB2GkkiFbpISpiFC6RffBE2xRD5\ntkYL3cxamNmLZjbDzF41s6vyn9/GzF4wszfM7G4za1b6uCKVrVs3uOwyuPtumDw5dhpJmkLO0JcD\nfdx9N6A7cIiZ7QNcD4xw9+2AT4GzShdTRNYYNgx23DEs3rV0aew0kiSNFroHS/IfVucfDvQB7s1/\nfjxwdEkSisg3NGsWlgWYOxeuvDJ2GkmSgsbQzaypmU0HFgCPAW8Bi9x9Zf4p84DO9XztYDOrM7O6\nhQsXFiOzSMXr1SvsbDRiBLz8cuw0khQFFbq7r3L37kAN0APYcV1Pq+drR7l7rbvXdujQYcOTisg3\nXH89tG8PgwfDqlWx00gSrNcsF3dfBDwJ7AO0NbOq/B/VAB8UN5qINGTzzeHXv4a6urB1nUghs1w6\nmFnb/PstgX7AbGAqcHz+aQOBB0sVUkTWbcAAOPTQMPPlvfdip5HYCjlD7wRMNbOZwN+Ax9z9EWAY\ncIGZvQm0A3T/mkiZmcGtt4blAH78Y21ZV+mqGnuCu88Edl/H598mjKeLSERdusDVV4e10++/H447\nLnYiiUV3iopkwNChsPvucO658NlnsdNILCp0kQyoqgpz0//5T7jkkthpJBYVukhG1NaGM/TbboPn\nnoudRmJQoYtkyDXXQOfOYW76ihWx00i5qdBFMqRNmzDrZdassH66VBYVukjGHHFEmOly9dXw1lux\n00g5qdBFMujmm8MiXkOGaG56JVGhi2TQllvC8OHwf/8HEybETiPlokIXyaghQ2DffeH88+Gjj2Kn\nkXJQoYtkVJMmYW76okVhH1LJPhW6SIbtskso83HjYOrU2Gmk1FToIhn3s5/BtttCLgdffhk7jZSS\nCl0k41q2hJEj4Y034LrrYqeRUlKhi1SAfv3gtNPCLkevvRY7jZSKCl2kQtxwA2y6aVgWYPXq2Gmk\nFFToIhWiQ4ewHMBf/gKjR8dOI6WgQhepIAMHQu/ecOGFMH9+7DRSbCp0kQpiBr/9bZjtct55sdNI\nsanQRSrMdtuFTaUnTYJHH42dRopJhS5SgS68EHbaCX70I1i6NHYaKRYVukgFatYsLAswdy5ccUXs\nNFIsVbEDiEgcPXuGu0dHjICTT4Y99oidqH7usGwZLF4cNsFevDh8rnlzaNFi3W+bNo2duvzMy7hY\ncm1trdfV1ZXteCLSsEWLYMcdw7Z1zz8fNpsuptWrw5DO2kXc0KOh56zv3PmqqvoLv74XgWK8LcUL\ni5lNc/faRv/OG38oEUmrtm3h17+GE0+EW275eubLqlWwZElhRdvQcz7/vLANNlq1Cjc9rf3YYovv\nfm7No02bUJRffgnLl2/Y2y+/hE8+afg5xTjfXfPCUlcHO+yw8d+vwWOV9tuLSNKdcAKMHw/DhsEv\nfxmKeMmSwr62TZvvlm1NTf1FvOax2WZfv9+6dfF/MygGd1i5sv4Xg/V9AWnXrvSZE/hjFJFyMoPb\nb4errgrDGoWU8JoibpLhaRVmUF0dHm3axE5TGBW6iLDlluGGI0m3DL++iohUFhW6iEhGqNBFRDJC\nhS4ikhEqdBGRjFChi4hkhApdRCQjVOgiIhlR1sW5zGwhMHcDv7w98FER45RamvIqa+mkKW+askK6\n8m5s1q3dvUNjTyproW8MM6srZLWxpEhTXmUtnTTlTVNWSFfecmXVkIuISEao0EVEMiJNhT4qdoD1\nlKa8ylo6acqbpqyQrrxlyZqaMXQREWlYms7QRUSkAYkvdDP7vplNNbPZZvaqmQ2Nnak+ZtbCzF40\nsxn5rFfFztQYM2tqZi+b2SOxszTGzOaY2StmNt3MEr85rZm1NbN7zezv+X+/+8bOtC5m1i3/M13z\nWGxm58XOVR8zOz///2uWmf3BzFrEztQQMxuaz/pqqX+uiR9yMbNOQCd3f8nM2gDTgKPd/bXI0b7D\nzAxo5e5LzKwaeBYY6u7PR45WLzO7AKgFNnX3w2PnaYiZzQFq3T0Vc4/NbDzwjLuPNrNmwCbuvih2\nroaYWVPgfWBvd9/Qe0ZKxsw6E/5f7eTuX5jZJOBRdx8XN9m6mdkuwESgB/AV8GfgHHd/oxTHS/wZ\nurvPd/eX8u9/DswGOsdNtW4erNmNsTr/SOwrppnVAIcBo2NnyRoz2xTYHxgD4O5fJb3M8/oCbyWx\nzNdSBbQ0sypgE+CDyHkasiPwvLsvc/eVwFPAMaU6WOILfW1m1gXYHXghbpL65YcwpgMLgMfcPbFZ\ngZuAC4HVsYMUyIEpZjbNzAbHDtOIrsBC4I78kNZoM2sVO1QBTgL+EDtEfdz9feBXwLvAfOAzd58S\nN1WDZgH7m1k7M9sE+AHw/VIdLDWFbmatgfuA89x9cew89XH3Ve7eHagBeuR/5UocMzscWODu02Jn\nWQ893X0P4FDgx2a2f+xADagC9gBuc/fdgaXARXEjNSw/LHQkcE/sLPUxs82Bo4BtgC2BVmZ2atxU\n9XP32cD1wGOE4ZYZwMpSHS8VhZ4fj74PmODu98fOU4j8r9dPAodEjlKfnsCR+XHpiUAfM7szbqSG\nufsH+bcLgAcI45JJNQ+Yt9ZvaPcSCj7JDgVecvd/xg7SgH7AO+6+0N1XAPcD/x45U4PcfYy77+Hu\n+wOfACUZP4cUFHr+QuMYYLa73xg7T0PMrIOZtc2/35Lwj+/vcVOtm7tf7O417t6F8Gv2E+6e2DMd\nM2uVvyhOfujiIMKvs4nk7h8C75lZt/yn+gKJu5D/LT8kwcMtee8C+5jZJvlu6Eu4rpZYZtYx/3Yr\n4FhK+DOuKtU3LqKewGnAK/mxaYBL3P3RiJnq0wkYn58p0ASY5O6Jnw6YEv8CPBD+D1MF3OXuf44b\nqVHnAhPyQxlvA4Mi56lXfny3P5CLnaUh7v6Cmd0LvEQYuniZ5N8xep+ZtQNWAD92909LdaDET1sU\nEZHCJH7IRURECqNCFxHJCBW6iEhGqNBFRDJChS4ikhEqdBGRjFChi4hkhApdRCQj/h91bUeFVj+F\n/QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7fa757a3c780>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.plot(Ks, np.array(CH_scores), 'b-', label='CH_scores')\n",
    "\n",
    "index = np.unravel_index(np.argmax(CH_scores, axis=None), len(CH_scores))\n",
    "Best_K = Ks[index[0]]\n",
    "\n",
    "print('Best_K:{0}'.format(Best_K))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 这里只分成了2类,原因可能是\n",
    "    - 很多对公司描述中常用词重复但不重要的词导致,处理方法:可以先分析一下分词结果,增加停用词\n",
    "    - 很多词比较重要但是出现较少"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
